{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "%matplotlib inline"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "\n# Univariate Feature Selection\n\n\nAn example showing univariate feature selection.\n\nNoisy (non-informative) features are added to the iris data and\nunivariate feature selection is applied. For each feature, we plot the\np-values for the univariate feature selection and the corresponding\nweights of an SVM. We can see that univariate feature selection\nselects the informative features and that these have larger SVM weights.\n\nIn the total set of features, only the first 4 are significant. We\ncan see that they have the highest score with univariate feature\nselection. The SVM assigns a large weight to one of these features, but also\nassigns weight to many of the non-informative features.\nApplying univariate feature selection before the SVM\nincreases the SVM weight attributed to the significant features, and will\nthus improve classification.\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "import numpy as np\n",
        "import matplotlib.pyplot as plt\n",
        "\n",
        "from sklearn.datasets import load_iris\n",
        "from sklearn.model_selection import train_test_split\n",
        "from sklearn.preprocessing import MinMaxScaler\n",
        "from sklearn.svm import LinearSVC\n",
        "from sklearn.pipeline import make_pipeline\n",
        "from sklearn.feature_selection import SelectKBest, f_classif\n",
        "\n",
        "# #############################################################################\n",
        "# Import some data to play with\n",
        "\n",
        "# The iris dataset\n",
        "X, y = load_iris(return_X_y=True)\n",
        "\n",
        "# Some noisy data not correlated with the target: 20 uniform columns drawn\n",
        "# from a seeded RandomState so every run sees the same noise\n",
        "E = np.random.RandomState(42).uniform(0, 0.1, size=(X.shape[0], 20))\n",
        "\n",
        "# Append the noisy columns after the informative features\n",
        "X = np.hstack((X, E))\n",
        "\n",
        "# Split dataset to select feature and evaluate the classifier\n",
        "X_train, X_test, y_train, y_test = train_test_split(\n",
        "        X, y, stratify=y, random_state=0\n",
        ")\n",
        "\n",
        "plt.figure(1)\n",
        "plt.clf()\n",
        "\n",
        "# One x position per feature column (informative + noisy)\n",
        "X_indices = np.arange(X.shape[-1])\n",
        "\n",
        "# #############################################################################\n",
        "# Univariate feature selection with F-test for feature scoring\n",
        "# We keep the four most significant features\n",
        "selector = SelectKBest(f_classif, k=4)\n",
        "selector.fit(X_train, y_train)\n",
        "# Smaller p-value -> larger score; rescale so the best feature scores 1\n",
        "scores = -np.log10(selector.pvalues_)\n",
        "scores /= scores.max()\n",
        "plt.bar(X_indices - .45, scores, width=.2,\n",
        "        label=r'Univariate score ($-Log(p_{value})$)', color='darkorange',\n",
        "        edgecolor='black')\n",
        "\n",
        "# #############################################################################\n",
        "# Compare to the weights of an SVM\n",
        "# random_state pins any internal shuffling done by LinearSVC so that the\n",
        "# weights and the reported accuracy are the same on every run\n",
        "clf = make_pipeline(MinMaxScaler(), LinearSVC(random_state=0))\n",
        "clf.fit(X_train, y_train)\n",
        "print('Classification accuracy without selecting features: {:.3f}'\n",
        "      .format(clf.score(X_test, y_test)))\n",
        "\n",
        "# One weight per feature: absolute coefficients summed along axis 0 of\n",
        "# coef_, then normalized so the weights sum to 1\n",
        "svm_weights = np.abs(clf[-1].coef_).sum(axis=0)\n",
        "svm_weights /= svm_weights.sum()\n",
        "\n",
        "plt.bar(X_indices - .25, svm_weights, width=.2, label='SVM weight',\n",
        "        color='navy', edgecolor='black')\n",
        "\n",
        "# Same classifier, but trained only on the four best univariate features\n",
        "clf_selected = make_pipeline(\n",
        "        SelectKBest(f_classif, k=4), MinMaxScaler(), LinearSVC(random_state=0)\n",
        ")\n",
        "clf_selected.fit(X_train, y_train)\n",
        "print('Classification accuracy after univariate feature selection: {:.3f}'\n",
        "      .format(clf_selected.score(X_test, y_test)))\n",
        "\n",
        "svm_weights_selected = np.abs(clf_selected[-1].coef_).sum(axis=0)\n",
        "svm_weights_selected /= svm_weights_selected.sum()\n",
        "\n",
        "# Take the mask from the selector fitted inside the pipeline so the bar\n",
        "# positions always correspond to the features whose weights are plotted\n",
        "selected_mask = clf_selected[0].get_support()\n",
        "plt.bar(X_indices[selected_mask] - .05, svm_weights_selected,\n",
        "        width=.2, label='SVM weights after selection', color='c',\n",
        "        edgecolor='black')\n",
        "\n",
        "\n",
        "plt.title(\"Comparing feature selection\")\n",
        "plt.xlabel('Feature number')\n",
        "# Each series is normalized differently, so the y scale is not meaningful\n",
        "plt.yticks(())\n",
        "plt.axis('tight')\n",
        "plt.legend(loc='upper right')\n",
        "plt.show()"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.6.9"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}